# Dependencies
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(bizdays)
## 
## Attaching package: 'bizdays'
## The following object is masked from 'package:stats':
## 
##     offset
library(stringr)
# Create a calendar that lists federal holidays 
bizdays::load_builtin_calendars()
# Using Brazil/ANBIMA because ANBIMA is no longer functional: https://github.com/msperlin/GetTDData/issues/10
# The difference seems to be trivial, as they seem to be the same
cal = bizdays::calendars()[["Brazil/ANBIMA"]]
# 57 features -> 31 features
JPM_balance_sheet = read_csv("../raw_data/Original/BalanceSheetJPM.csv")
## New names:
## Rows: 57 Columns: 72
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (35): ...1, 30 SEP '25, 31 DEC '23, 31 DEC '22, 31 DEC '21, 30 JUN '21, ... num
## (37): 30 JUN '25, 31 MAR '25, 31 DEC '24, 30 SEP '24, 30 JUN '24, 31 MAR...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_balance_sheet = na.omit(JPM_balance_sheet) 
JPM_balance_sheet = JPM_balance_sheet %>% mutate(across(-1, as.character)) %>%  pivot_longer(
    cols = -1,
    names_to = "Year_Quarter",
    values_to = "value"
  ) %>%
  pivot_wider(
    names_from = 1,  
    values_from = value
  ) %>%
  mutate(Year_Quarter = case_when(
    grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
    grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
    grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
    grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
    TRUE ~ Year_Quarter
  ))
JPM_balance_sheet
# YTD
# 73 features -> 41 features
JPM_cash_flow = read_csv("../raw_data/Original/CashFlowJPM.csv") 
## New names:
## Rows: 73 Columns: 75
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (75): ...1, 30 JUN '25, 31 MAR '25, 31 DEC '24, 30 SEP '24, 30 JUN '24, ...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_cash_flow = na.omit(JPM_cash_flow)

JPM_cash_flow = JPM_cash_flow  %>% mutate(across(-1, as.character)) %>% pivot_longer(
    cols = -1,
    names_to = "Year_Quarter",
    values_to = "value") %>% pivot_wider(
      names_from = 1,  
      values_from = value,
      values_fn = first
      ) %>%
  mutate(Year_Quarter = case_when(
    grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
    grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
    grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
    grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
    TRUE ~ Year_Quarter
  )) %>% mutate(across(!Year_Quarter, ~ {
    . %>%
      str_replace_all("\\.", "") %>%  # Remove thousands separator (period)
      str_replace(",", ".") %>%       # Replace decimal comma with period
      as.numeric()
  }))

JPM_cash_flow = JPM_cash_flow %>% arrange(Year_Quarter) %>% slice(5:n()) %>% mutate(
    Year = as.numeric(substr(Year_Quarter, 1, 4))
  ) %>% group_by(Year) %>% mutate(
  across(
    where(is.numeric),
    ~ if_else(
      row_number() == 1,  # if it's Q1 (first in group)
      .x,                 # keep original YTD value
      .x - lag(.x)        # otherwise, subtract previous quarter
      ))
  ) %>% ungroup() %>% select(!Year)

JPM_cash_flow
# No features lost
FED_funds = read_csv("../raw_data/Original/FEDFUNDS.csv") 
## Rows: 856 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): FEDFUNDS
## date (1): observation_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
FED_funds = na.omit(FED_funds)

FED_funds = FED_funds %>% 
  mutate(Year_Quarter = paste0(year(observation_date), "-Q", quarter(observation_date))) %>% 
  select(Year_Quarter, FEDFUNDS) %>% 
  group_by(Year_Quarter) %>% 
  summarise(FEDFUNDS = mean(FEDFUNDS, na.rm = TRUE)) %>% 
  ungroup()
FED_funds
# YTD
# 53 features -> 31 features
JPM_income = read_csv("../raw_data/Original/IncomeStatementJPM.csv") 
## New names:
## Rows: 53 Columns: 76
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 num (75): 30 SEP '25, 30 JUN '25, 31 MAR '25, 31 DEC '24, 30 SEP '24,
## 30 JUN...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_income = na.omit(JPM_income)

JPM_income = JPM_income %>% mutate(across(-1, as.character)) %>% pivot_longer(
    cols = -1,
    names_to = "Year_Quarter",
    values_to = "value") %>% pivot_wider(
      names_from = 1,  
      values_from = value,
      values_fn = first
      ) %>%
  mutate(Year_Quarter = case_when(
    grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
    grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
    grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
    grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
    TRUE ~ Year_Quarter
  ))
JPM_income = JPM_income %>% mutate(across(!Year_Quarter, ~ {
    . %>%
      str_replace_all("\\.", "") %>%  # Remove thousands separator (period)
      str_replace(",", ".") %>%       # Replace decimal comma with period
      as.numeric()
  }))

JPM_income = JPM_income %>% arrange(Year_Quarter) %>% slice(5:n()) %>% mutate(
    Year = as.numeric(substr(Year_Quarter, 1, 4))
  ) %>% group_by(Year) %>% mutate(
  across(
    where(is.numeric),
    ~ if_else(
      row_number() == 1,  # if it's Q1 (first in group)
      .x,                 # keep original YTD value
      .x - lag(.x)        # otherwise, subtract previous quarter
      ))
  ) %>% ungroup() %>% select(!Year)

JPM_income
# 57 features -> 31 features
JPM_balance_sheet = read_csv("../raw_data/Original/BalanceSheetJPM.csv")
## New names:
## Rows: 57 Columns: 72
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (35): ...1, 30 SEP '25, 31 DEC '23, 31 DEC '22, 31 DEC '21, 30 JUN '21, ... num
## (37): 30 JUN '25, 31 MAR '25, 31 DEC '24, 30 SEP '24, 30 JUN '24, 31 MAR...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_balance_sheet = na.omit(JPM_balance_sheet) 
JPM_balance_sheet = JPM_balance_sheet %>% mutate(across(-1, as.character)) %>%  pivot_longer(
    cols = -1,
    names_to = "Year_Quarter",
    values_to = "value"
  ) %>%
  pivot_wider(
    names_from = 1,  
    values_from = value
  ) %>%
  mutate(Year_Quarter = case_when(
    grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
    grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
    grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
    grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
    TRUE ~ Year_Quarter
  ))
JPM_balance_sheet
JPM_ratios = read_csv("../raw_data/Original/RatiosJPM.csv")
## New names:
## Rows: 60 Columns: 101
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (96): ...1, SEP '25, JUN '25, MAR '25, DEC '24, SEP '24, JUN '24, MAR '2... num
## (5): SEP '06, JUN '06, MAR '06, DEC '05, SEP '05
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_ratios = na.omit(JPM_ratios)
JPM_ratios = JPM_ratios %>% mutate(across(-1, as.character)) %>%  pivot_longer(
    cols = -1,
    names_to = "Year_Quarter",
    values_to = "value"
  ) %>%
  pivot_wider(
    names_from = 1,  
    values_from = value
  ) %>%
  mutate(Year_Quarter = case_when(
    grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
    grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
    grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
    grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
    TRUE ~ Year_Quarter
  ))
JPM_key = read_csv("../raw_data/Original/KeyItemsJPM_Interpolated.csv")
## New names:
## Rows: 34 Columns: 71
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (71): ...1, SEP '25, JUN '25, MAR '25, DEC '24, SEP '24, JUN '24, MAR '2...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
JPM_key = na.omit(JPM_key)
JPM_key = JPM_key %>% mutate(across(-1, as.character)) %>%  pivot_longer(
    cols = -1,
    names_to = "Year_Quarter",
    values_to = "value"
  ) %>%
  pivot_wider(
    names_from = 1,  
    values_from = value
  ) %>%
  mutate(Year_Quarter = case_when(
    grepl("SEP", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q3"),
    grepl("JUN", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q2"),
    grepl("MAR", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q1"),
    grepl("DEC", Year_Quarter) ~ paste0("20", sub(".*'(\\d+)", "\\1", Year_Quarter), "-Q4"),
    TRUE ~ Year_Quarter
  ))
HYBond = read_csv("../raw_data/Original/HYBond_interpolated.csv")
## Rows: 7642 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): BAMLH0A0HYM2EY
## date (1): observation_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
HYBond = na.omit(HYBond)
HYBond = HYBond %>% rename(Date = observation_date, HY_bond_rate = BAMLH0A0HYM2EY)
HYBond
IgBond = read_csv("../raw_data/Original/IgBond.csv")
## Rows: 7642 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): BAMLC0A4CBBBEY
## date (1): observation_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
IgBond = na.omit(IgBond)
IgBond = IgBond %>% rename(Date = observation_date, IG_bond_rate = BAMLC0A4CBBBEY)
IgBond
JPM = read_csv("../raw_data/Original/PriceHistoryJPM.csv")
## Rows: 6501 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Date
## dbl (10): Price, CVol, Change, % Change, % Return, Total Return (Gross), Ope...
## num  (1): Cumulative Return %
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fill in dates to identify quarters
JPM = JPM %>% mutate(Date = offset(as.Date("2025-11-02"), -(0:(n()-1)), cal), Year_Quarter = paste0(year(Date), "-Q", quarter(Date)))
JPM = na.omit(JPM %>% select(Date, Year_Quarter, everything())) # Remove rows with NA values (No value generation for now)
# Merge the datasets
JPM = JPM %>%
  inner_join(JPM_balance_sheet, by = "Year_Quarter") %>%
  inner_join(JPM_cash_flow, by = "Year_Quarter") %>%
  inner_join(FED_funds, by = "Year_Quarter") %>%
  inner_join(JPM_income, by = "Year_Quarter") %>%
  inner_join(JPM_ratios, by = "Year_Quarter") %>% 
  inner_join(JPM_key, by = "Year_Quarter") %>%
  inner_join(HYBond, by="Date") %>%
  inner_join(IgBond, by="Date")
  

JPM = na.omit(JPM)
JPM <- JPM %>%
  arrange(Date) %>% 
  mutate(
    # change from yesterday → today
    Percent_change_backward = (Price - lag(Price)) / lag(Price) * 100,

    # change from today → tomorrow
    Percent_change_forward = (lead(Price) - Price) / Price * 100
  )

JPM = JPM %>% select(Date, Year_Quarter, Price, Percent_change_backward, Percent_change_forward, everything()) %>% drop_na()
JPM
bin_return = function(x) {
  case_when(
    x < -10               ~ 1,
    x >= -10 & x < -9     ~ 2,
    x >= -9  & x < -8     ~ 3,
    x >= -8  & x < -7     ~ 4,
    x >= -7  & x < -6     ~ 5,
    x >= -6  & x < -5     ~ 6,
    x >= -5  & x < -4.5   ~ 7,
    x >= -4.5 & x < -4    ~ 8,
    x >= -4   & x < -3.5  ~ 9,
    x >= -3.5 & x < -3    ~ 10,
    x >= -3   & x < -2.5  ~ 11,
    x >= -2.5 & x < -2.05 ~ 12,
    x >= -2.05 & x < -1.85 ~ 13,
    x >= -1.85 & x < -1.65 ~ 14,
    x >= -1.65 & x < -1.45 ~ 15,
    x >= -1.45 & x < -1.25 ~ 16,
    x >= -1.25 & x < -1.05 ~ 17,
    x >= -1.05 & x < -0.95 ~ 18,
    x >= -0.95 & x < -0.85 ~ 19,
    x >= -0.85 & x < -0.75 ~ 20,
    x >= -0.75 & x < -0.65 ~ 21,
    x >= -0.65 & x < -0.55 ~ 22,
    x >= -0.55 & x < -0.45 ~ 23,
    x >= -0.45 & x < -0.35 ~ 24,
    x >= -0.35 & x < -0.25 ~ 25,
    x >= -0.25 & x < -0.15 ~ 26,
    x >= -0.15 & x < -0.05 ~ 27,
    x >= -0.05 & x < 0.05  ~ 28,
    x >= 0.05  & x < 0.15  ~ 29,
    x >= 0.15  & x < 0.25  ~ 30,
    x >= 0.25  & x < 0.35  ~ 31,
    x >= 0.35  & x < 0.45  ~ 32,
    x >= 0.45  & x < 0.55  ~ 33,
    x >= 0.55  & x < 0.65  ~ 34,
    x >= 0.65  & x < 0.75  ~ 35,
    x >= 0.75  & x < 0.85  ~ 36,
    x >= 0.85  & x < 0.95  ~ 37,
    x >= 0.95  & x < 1.05  ~ 38,
    x >= 1.05  & x < 1.25  ~ 39,
    x >= 1.25  & x < 1.45  ~ 40,
    x >= 1.45  & x < 1.65  ~ 41,
    x >= 1.65  & x < 1.85  ~ 42,
    x >= 1.85  & x < 2.05  ~ 43,
    x >= 2.05  & x < 2.55  ~ 44,
    x >= 2.55  & x < 3.05  ~ 45,
    x >= 3.05  & x < 3.55  ~ 46,
    x >= 3.55  & x < 4.05  ~ 47,
    x >= 4.05  & x < 4.55  ~ 48,
    x >= 4.55  & x < 5     ~ 49,
    x >= 5     & x < 6     ~ 50,
    x >= 6     & x < 7     ~ 51,
    x >= 7     & x < 8     ~ 52,
    x >= 8     & x < 9     ~ 53,
    x >= 9     & x < 10    ~ 54,
    x > 10                 ~ 55,
    TRUE ~ NA_integer_
  )
}


JPM = JPM %>% mutate(
  Backward_Bin = bin_return(Percent_change_backward), 
  Forward_Bin  = bin_return(Percent_change_forward)
  )
label = JPM %>% select(Forward_Bin)

JPM <- JPM %>%
  select(
    Date,
    Year_Quarter,
    Price,
    Percent_change_backward,
    Backward_Bin,
    everything(),
    -Forward_Bin,
    -Percent_change_forward
  )
JPM
# Remove European commas and convert everything to numeric decimal, remove Date and Quarter features
JPM <- JPM %>%
  select(-Year_Quarter, -Date) %>%
  mutate(across(where(is.character), ~ {
    . %>%
      str_replace_all("\\.", "") %>%  # Remove thousands separator (period)
      str_replace(",", ".") %>%       # Replace decimal comma with period
      as.numeric()
  }))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(...)`.
## Caused by warning in `Opinion %>% str_replace_all("\\.", "") %>% str_replace(",", ".") %>% as.numeric()`:
## ! NAs introduced by coercion
JPM
# Final datasets
JPM= JPM %>% mutate(index = row_number()) %>% select(index, everything())
JPM
write.csv(JPM, "train.csv", row.names = FALSE)
write.csv(label, "label.csv", row.names = FALSE)